{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "NLP-Week4-Exercise-Shakespeare-Question.ipynb",
      "version": "0.3.2",
      "provenance": [
        {
          "file_id": "1P_MeOoZuEGgMtvn3yVAQFWDhnuyBzrv1",
          "timestamp": 1559583333688
        },
        {
          "file_id": "1qmXwwCaT07-qviCiBV65lkus_KvwsyQI",
          "timestamp": 1559583256599
        },
        {
          "file_id": "1V60jn23JMcMpwhF-KvCTYIIfm4J2X6v5",
          "timestamp": 1558728367158
        }
      ]
    },
    "kernelspec": {
      "name": "python2",
      "display_name": "Python 2"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "BOwsuGQQY9OL",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.models import Sequential\n",
        "from tensorflow.keras.optimizers import Adam\n",
        "### YOUR CODE HERE\n",
        "# Figure out how to import regularizers\n",
        "###\n",
        "import tensorflow.keras.utils as ku \n",
        "import numpy as np "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "PRnDnCW-Z7qv",
        "colab": {}
      },
      "source": [
        "tokenizer = Tokenizer()\n",
        "!wget --no-check-certificate \\\n",
        "    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \\\n",
        "    -O /tmp/sonnets.txt\n",
        "data = open('/tmp/sonnets.txt').read()\n",
        "\n",
        "corpus = data.lower().split(\"\\n\")\n",
        "\n",
        "\n",
        "tokenizer.fit_on_texts(corpus)\n",
        "total_words = len(tokenizer.word_index) + 1\n",
        "\n",
        "# create input sequences using list of tokens\n",
        "input_sequences = []\n",
        "for line in corpus:\n",
        "\ttoken_list = tokenizer.texts_to_sequences([line])[0]\n",
        "\tfor i in range(1, len(token_list)):\n",
        "\t\tn_gram_sequence = token_list[:i+1]\n",
        "\t\tinput_sequences.append(n_gram_sequence)\n",
        "\n",
        "\n",
        "# pad sequences \n",
        "max_sequence_len = max([len(x) for x in input_sequences])\n",
        "input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))\n",
        "\n",
        "# create predictors and label\n",
        "predictors, label = input_sequences[:,:-1],input_sequences[:,-1]\n",
        "\n",
        "label = ku.to_categorical(label, num_classes=total_words)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "w9vH8Y59ajYL",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "model = Sequential()\n",
        "model.add(# Your Embedding Layer)\n",
        "model.add(# An LSTM Layer)\n",
        "model.add(# A dropout layer)\n",
        "model.add(# Another LSTM Layer)\n",
        "model.add(# A Dense Layer including regularizers)\n",
        "model.add(# A Dense Layer)\n",
        "# Pick an optimizer\n",
        "model.compile(# Pick a loss function and an optimizer)\n",
        "print(model.summary())\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AIg2f1HBxqof",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        " history = model.fit(predictors, label, epochs=100, verbose=1)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1fXTEO3GJ282",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import matplotlib.pyplot as plt\n",
        "acc = history.history['acc']\n",
        "loss = history.history['loss']\n",
        "\n",
        "epochs = range(len(acc))\n",
        "\n",
        "plt.plot(epochs, acc, 'b', label='Training accuracy')\n",
        "plt.title('Training accuracy')\n",
        "\n",
        "plt.figure()\n",
        "\n",
        "plt.plot(epochs, loss, 'b', label='Training Loss')\n",
        "plt.title('Training loss')\n",
        "plt.legend()\n",
        "\n",
        "plt.show()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6Vc6PHgxa6Hm",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "seed_text = \"Help me Obi Wan Kenobi, you're my only hope\"\n",
        "next_words = 100\n",
        "  \n",
        "for _ in range(next_words):\n",
        "\ttoken_list = tokenizer.texts_to_sequences([seed_text])[0]\n",
        "\ttoken_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')\n",
        "\tpredicted = model.predict_classes(token_list, verbose=0)\n",
        "\toutput_word = \"\"\n",
        "\tfor word, index in tokenizer.word_index.items():\n",
        "\t\tif index == predicted:\n",
        "\t\t\toutput_word = word\n",
        "\t\t\tbreak\n",
        "\tseed_text += \" \" + output_word\n",
        "print(seed_text)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}